"""Run basic posterior predictive checks."""
import argparse

import numpy as np
import pandas as pd

import datasets
import inference


def dataset():
    """Build the combined table of title counts and auxiliary series.

    The ``novels`` column is the concatenation of the Raven & Forster
    (1789-1799) counts and the Garside & Schöwerling title counts; the
    Garside & Schöwerling by-gender counts are joined onto it. Rows with a
    missing ``novels`` value are then appended for index values 1837-1919,
    and the remaining sources are joined on the index, in order: Publishers'
    Circular, Andrew Block, the Nineteenth-Century Short Title Catalogue
    series, the three discounted Bassett percentile columns, Casey's
    Athenaeum counts, and the British Isles population estimates.

    Returns:
        pandas.DataFrame: the assembled table (index presumably calendar
        years -- confirm against the ``datasets`` loaders).
    """
    counts_1789_1799 = datasets.raven_forster_1789_1799()
    counts_1800_1836 = datasets.garside_schöwerling_title_counts()
    gender_counts_1800_1829 = datasets.garside_schöwerling_title_counts_by_gender()

    observed = pd.DataFrame({'novels': pd.concat([counts_1789_1799, counts_1800_1836])})
    observed = observed.join(gender_counts_1800_1829)

    # Extend the table through 1919 with rows whose `novels` value is missing.
    first_padded_year, last_year = 1837, 1919
    padding = pd.DataFrame(
        {'novels': float('nan')},
        index=range(first_padded_year, last_year + 1),
    )

    bassett_columns = [
        'bassett_25_percentile',
        'bassett_50_percentile',
        'bassett_75_percentile',
    ]

    # Each loader is called immediately before its join, left to right.
    return (
        pd.concat([observed, padding])
        .join(datasets.publishers_circular())
        .join(datasets.andrew_block())
        .join(datasets.nineteenth_century_short_title_catalogue_loced())
        .join(datasets.bassett_at_the_circulating_library_priors_discounted()[bassett_columns])
        .join(datasets.casey_athenaeum_novels())
        .join(datasets._population_british_isles())
    )


def posterior_predictive_check_casey():
    """Check Casey's observed Athenaeum counts against the model's 90% interval.

    Posterior draws come from ``inference.sampling()``. For each gender
    category (``men``, ``women``, ``unknown_gender``) and each row of
    ``datasets.casey_athenaeum_novels_by_gender()``, the simulated counts for
    that year (``y_men_sim``, ``y_women_sim``, ``y_unknown_sim``) are
    multiplied element-wise by ``pct_casey_novels``. The observed count must
    lie strictly between the 5th and 95th percentiles of that product. One
    line is printed per gender/year combination.

    Raises:
        AssertionError: if the simulated values for a year and
            ``pct_casey_novels`` differ in shape, or if an observed count
            falls outside the interval. Raised explicitly rather than via
            ``assert`` so the checks still run under ``python -O``.
    """
    fit_extract = inference.sampling()
    casey_pct = fit_extract['pct_casey_novels']

    # The observed counts do not depend on the gender being checked, so load
    # them once instead of once per gender.
    casey_by_gender = datasets.casey_athenaeum_novels_by_gender()

    for gender in ['men', 'women', 'unknown_gender']:
        sim_key = f'y_{gender.replace("_gender", "")}_sim'
        # Transposed so that each row is one year, 1800-1919 (presumably the
        # columns are then posterior draws -- confirm against the model).
        simulated = pd.DataFrame(fit_extract[sim_key].T, index=range(1800, 1919 + 1))

        for year, row in casey_by_gender.iterrows():
            observed = row[f'athenaeum_novels_reviewed_{gender}']
            year_draws = simulated.loc[year].values

            # Guard against silent broadcasting in the product below.
            if year_draws.shape != casey_pct.shape:
                raise AssertionError((year_draws.shape, casey_pct.shape))

            casey_draws = year_draws * casey_pct
            model_lower, model_upper = np.percentile(casey_draws, [5, 95])
            print(f'casey ppc: year {year} observed authors with gender `{gender}` in casey {observed}, '
                  f'model 90% CI: {model_lower}-{model_upper}')

            if not model_lower < observed < model_upper:
                raise AssertionError(
                    f'casey ppc failed: year {year}, gender `{gender}`: observed {observed} '
                    f'outside model 90% CI {model_lower}-{model_upper}'
                )


if __name__ == '__main__':
    # Script entry point: run the Casey posterior predictive check. Any
    # exception raised by the check propagates, so 'done' is printed only
    # when the check returns normally.
    print('running posterior predictive checks...')
    posterior_predictive_check_casey()
    print('done')
